{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "GPU_id = 3\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = str(GPU_id)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# This workbook must be run using cudf 0.7"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "import cudf as gd\n",
    "import cupy as cp\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import xgboost as xgb\n",
    "import os\n",
    "import time\n",
    "import nvstrings\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'0.9.0'"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gd.__version__"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Global"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "GPU_RUN_TIME = {}\n",
    "STEPS = []"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def on_gpu(words,func,arg=None,dtype=np.int64):\n",
    "    \n",
    "    res = cp.array(words.size(), dtype=dtype)\n",
    "    if arg is None:\n",
    "        cmd = 'words.%s(res.device_ctypes_pointer.value)'%(func)\n",
    "    else:\n",
    "        cmd = 'words.%s(arg,res.device_ctypes_pointer.value)'%(func)\n",
    "    eval(cmd)\n",
    "    return res"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "if os.path.exists('cache')==False:\n",
    "    os.mkdir('cache')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = '/datasets/trivago/data'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "step = 'read csv'\n",
    "STEPS.append(step)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### cudf read csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train & test (15932992, 12) (3782335, 12)\n",
      "combined (19715327, 12)\n",
      "CPU times: user 2.96 s, sys: 2 s, total: 4.95 s\n",
      "Wall time: 9.02 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "start = time.time()\n",
    "train_gd = gd.read_csv('%s/train.csv'%path)\n",
    "test_gd = gd.read_csv('%s/test.csv'%path)\n",
    "submission_gd = gd.read_csv('%s/submission_popular.csv'%path)\n",
    "print(\"train & test\",train_gd.shape,test_gd.shape)\n",
    "data_gd = gd.concat([train_gd,test_gd])\n",
    "print('combined',data_gd.shape)\n",
    "GPU_RUN_TIME[step] = time.time() - start"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "del train_gd,test_gd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Only keep click out rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "step = 'string comparsion and masking'\n",
    "STEPS.append(step)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### cudf string comparsion and masking."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "# of clickouts: 2115365\n",
      "true test (253573, 13)\n",
      "253573\n",
      "true test shape match submission shape\n",
      "CPU times: user 4.05 s, sys: 2.15 s, total: 6.2 s\n",
      "Wall time: 10.5 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "start = time.time()\n",
    "action_type = nvstrings.from_strings(data_gd['action_type'].data)\n",
    "data_gd['is_click_out'] = action_type.compare('clickout item')\n",
    "del action_type\n",
    "data_gd['is_click_out'] = data_gd['is_click_out']==0 # 0 means string match\n",
    "data_gd['is_click_out'] = data_gd['is_click_out'].astype('bool')\n",
    "data_gd = data_gd[data_gd['is_click_out']]\n",
    "\n",
    "data_gd.drop_column('is_click_out')\n",
    "print(\"# of clickouts:\",data_gd.shape[0])\n",
    "data_gd['clickout_missing'] = data_gd['reference'].isnull()\n",
    "\n",
    "print('true test',data_gd[data_gd['clickout_missing']].shape)\n",
    "print(submission_gd.shape[0])\n",
    "assert submission_gd.shape[0] == data_gd[data_gd['clickout_missing']].shape[0]\n",
    "print('true test shape match submission shape')\n",
    "GPU_RUN_TIME[step] = time.time() - start"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 8 ms, sys: 4 ms, total: 12 ms\n",
      "Wall time: 12.8 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "data_gd['row_id'] = np.arange(data_gd.shape[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create recommendation list from `impressions`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "step = 'string column split & expand'\n",
    "STEPS.append(step)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### cudf string column split and expand"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 240 ms, sys: 324 ms, total: 564 ms\n",
      "Wall time: 2.08 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "start = time.time()\n",
    "# impressions/prices will be split into 25 columns\n",
    "candidates_gd = data_gd['impressions'].data.split('|')\n",
    "prices_gd = data_gd['prices'].data.split('|')\n",
    "GPU_RUN_TIME[step] = time.time() - start"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 4 ms, sys: 4 ms, total: 8 ms\n",
      "Wall time: 14.7 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "data_gd.drop_column('impressions')\n",
    "data_gd.drop_column('prices')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Assign string columns to dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "step = 'assign string columns to dataframe'\n",
    "STEPS.append(step)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 17.2 s, sys: 7.05 s, total: 24.2 s\n",
      "Wall time: 30.3 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "start = time.time()\n",
    "data_gd_rec_list = data_gd[['row_id']].to_pandas()\n",
    "for i in range(len(candidates_gd)):\n",
    "    data_gd_rec_list['item_%d'%i] = candidates_gd[i].to_host()\n",
    "    data_gd_rec_list['price_%d'%i] = prices_gd[i].to_host()\n",
    "data_gd_rec_list = data_gd_rec_list.set_index('row_id')\n",
    "GPU_RUN_TIME[step] = time.time() - start"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create data pair"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "step = 'create data pair'\n",
    "STEPS.append(step)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### cudf create data pair\n",
    "For functionalities that are not supported by cudf yet, such as `stack`, we fall back to pandas. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 50.8 s, sys: 13.5 s, total: 1min 4s\n",
      "Wall time: 1min 4s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "start = time.time()\n",
    "\n",
    "cols = [i for i in data_gd_rec_list.columns if i.startswith('item_')]\n",
    "items = data_gd_rec_list[cols].stack().reset_index()\n",
    "items.columns = ['row_id','candidate_order','item_id']\n",
    "\n",
    "cols = [i for i in data_gd_rec_list.columns if i.startswith('price_')]\n",
    "prices = data_gd_rec_list[cols].stack().reset_index()\n",
    "prices.columns = ['row_id','candidate_order','price']\n",
    "\n",
    "items['price'] = prices['price'].astype(int)\n",
    "items['candidate_order'] = items['candidate_order'].apply(lambda x:x.split('_')[1]).astype(int)\n",
    "\n",
    "count = items['row_id'].value_counts()\n",
    "items['row_id_count'] = items['row_id'].map(count)\n",
    "items = items[items['row_id_count']>1]\n",
    "items_gd = gd.from_pandas(items)\n",
    "\n",
    "data_gd['clickout_missing'] = data_gd['clickout_missing'].astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_pair_gd = items_gd.merge(data_gd,on='row_id',how='left')\n",
    "del data_gd, items_gd\n",
    "data_pair_gd['reference'] = data_pair_gd['reference'].astype(int)\n",
    "data_pair_gd['item_id'] = data_pair_gd['item_id'].astype(int)\n",
    "data_pair_gd['target'] = data_pair_gd['reference'] == data_pair_gd['item_id']\n",
    "data_pair_gd['target'] = data_pair_gd['target'].astype(int)\n",
    "\n",
    "GPU_RUN_TIME[step] = time.time() - start"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 4.74 s, sys: 2.34 s, total: 7.08 s\n",
      "Wall time: 9.22 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "test_pair_gd = data_pair_gd.loc[data_pair_gd.clickout_missing>0]\n",
    "test_pair_gd.to_parquet('../../cache/test/')\n",
    "del test_pair_gd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 200 ms, sys: 360 ms, total: 560 ms\n",
      "Wall time: 1.39 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "train_pair_gd = data_pair_gd.loc[data_pair_gd.clickout_missing==0]\n",
    "del data_pair_gd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_pair_gd['is_va'] = train_pair_gd.row_id%5 == 0\n",
    "train_pair = train_pair_gd[train_pair_gd.is_va==0]\n",
    "valid_pair = train_pair_gd[train_pair_gd.is_va>0]\n",
    "del train_pair_gd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_pair = train_pair.drop(['is_va'])\n",
    "valid_pair = valid_pair.drop(['is_va'])\n",
    "valid_pair.to_parquet('../../cache/valid/')\n",
    "train_pair.to_parquet('../../cache/train/')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Visualize the timing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'read csv': 9.023593664169312,\n",
       " 'string comparsion and masking': 10.48141098022461,\n",
       " 'string column split & expand': 2.0812671184539795,\n",
       " 'assign string columns to dataframe': 30.27687931060791,\n",
       " 'create data pair': 67.65250587463379}"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "GPU_RUN_TIME"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
